# coding: utf-8
import numpy as np
from scipy import stats
import random
from keras.models import load_model
import codecs
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import model_from_json
from keras.preprocessing import sequence
from keras.preprocessing import text
import json
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics.ranking import _binary_clf_curve
import os
# Set TensorFlow's C++ log-level environment variable to '3'.
# NOTE(review): keras is already imported earlier in this file, so this
# assignment may happen too late to affect start-up logging — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
####### NOTE: the optimizer's start value needs to be set properly
from keras import backend as K
# NOTE(review): set_image_dim_ordering comes from the older Keras backend API;
# newer releases expose set_image_data_format instead — confirm against the
# Keras version this project pins.
K.set_image_dim_ordering('tf')
import pandas as pd
from datetime import datetime



from dependency import * ###### dependency files
#from data_helper import *
# NOTE(review): the name (sic, "vecor") suggests the embedding dimension of
# the model — confirm. The misspelling is kept because the name is referenced
# elsewhere in this module.
embedding_vecor_length = 128
# NOTE(review): presumably the number of token indices per padded text —
# confirm against the input length the model expects.
max_review_length = 128

from itertools import chain

def isfloat(value):
	"""Return True when ``float(value)`` succeeds, False otherwise.

	Args:
		value: Any object. Numbers and numeric-looking strings (including
			"nan" and "inf") yield True.

	Returns:
		bool: False for values that ``float`` rejects, whether because the
		content is not numeric (``ValueError``) or because the type cannot be
		converted at all, e.g. ``None`` or a list (``TypeError``).
	"""
	try:
		float(value)
	except (TypeError, ValueError):
		# TypeError covers None/list/dict, which previously escaped and
		# crashed the caller instead of answering "not a float".
		return False
	return True

	
### functions are from process_sherlock.py

def readWordPosFromFile(filepath):
	"""Build a word -> line-index mapping from a UTF-8 text file.

	The key of each entry is the text before the first comma on the line and
	the value is the zero-based line number. When a key occurs on several
	lines, the last occurrence wins.

	Args:
		filepath: Path of the file to read.

	Returns:
		dict: Mapping from the first comma-separated field of each line to
		that line's index.
	"""
	# The context manager closes the handle; the original left it open.
	with codecs.open(filepath, 'r', encoding = "utf-8") as handle:
		lines = handle.readlines()
	return {line.split(",")[0]: index for index, line in enumerate(lines)}

def getNormalTextString(words):
	"""Filter a token list using the module's stop list and accept list.

	Primary path: tokens are treated as "<word>_<tag>" strings. Tokens whose
	word part is in ``stop`` are dropped, then only tokens whose tag is in
	``acceptlist`` are kept and reduced to their word part.

	Fallback path: if the primary path raises (for example a token has no
	"_" so there is no tag field), tokens found in ``stop`` and tokens equal
	to a single space are dropped and the rest are returned unchanged.

	Args:
		words: Iterable of token strings.

	Returns:
		list: The filtered tokens.
	"""
	try:
		words = [x for x in words if x.split("_")[0] not in stop]
		# Keep only words whose part-of-speech tag is in the accept list.
		words = [x.split("_")[0] for x in words if x.split("_")[1] in acceptlist]
	except Exception:
		# Deliberate best-effort fallback for untagged tokens. `words` may
		# already hold the result of the first filter above when the second
		# one is what failed. `except Exception` (not a bare except) lets
		# KeyboardInterrupt / SystemExit propagate.
		words = [x for x in words if x not in stop and x != " "]
	return words




def getWordSeq(words, wordpos):
	"""Translate tokens into their indices in ``wordpos``.

	Args:
		words: Iterable of tokens.
		wordpos: Mapping from token to index.

	Returns:
		list: A one-element list wrapping the list of indices. Tokens missing
		from ``wordpos`` are encoded as 0. A token whose lookup raises is
		reported on stdout and left out of the result.
	"""
	indices = []
	for token in words:
		try:
			index = wordpos[token] if token in wordpos else 0
		except Exception as err:
			print(token.__class__, token, "error", err)
		else:
			indices.append(index)
	return [indices]



import jieba
# Add every word listed in the high-frequency protest-word file to jieba's
# dictionary (one word per line, surrounding whitespace stripped).
filename = os.path.join(dirname, "../supporting/high_frequency_protest_words.txt")
# The context manager closes the handle; the original left it open.
with codecs.open(filename, 'r', encoding = "utf-8") as _protest_words_file:
	ff = [x.strip() for x in _protest_words_file.readlines()]
for k in ff:
	jieba.add_word(k)



def segmenter(text):
	"""Segment raw text with jieba and return the kept tokens joined by spaces.

	Steps: replace every match of the module-level ``filtrate`` pattern with a
	space, cut the text with jieba (``cut_all=False``), drop whitespace-only
	tokens, then filter the tokens through ``getNormalTextString``.

	Args:
		text: The raw text. Falsy values (``None``, ``""``) are accepted.

	Returns:
		str: Space-joined tokens, or a single space ``" "`` when the input is
		falsy, when no token survives filtering, or when any step raises.
	"""
	if not text:
		return " "
	try:
		text = filtrate.sub(r' ', text)
		seg_list = [l for l in jieba.cut(text, cut_all=False) if not l.isspace()]
		seg_list = getNormalTextString(seg_list)
		return " ".join(seg_list) if seg_list else " "
	except Exception:
		# Best-effort by design: unprocessable input (e.g. a non-string such
		# as a float NaN) becomes a blank. `except Exception` rather than a
		# bare except so KeyboardInterrupt / SystemExit still propagate.
		return " "
		

		
## Given one string of space-separated segmented text, e.g. "中国 人民",
## produce a fixed-length, zero-padded list of vocabulary indices.
def string2sequence(s, wordpos, maxlen=None):
	"""Encode one segmented text as a fixed-length list of vocabulary indices.

	Args:
		s: Space-separated tokens. ``None`` and anything ``float()`` accepts
			are treated as missing text.
			NOTE(review): a purely numeric text such as "2019" is therefore
			also treated as missing — confirm this is intended.
		wordpos: Mapping from token to index; unknown tokens are encoded as 0.
		maxlen: Length of the returned list. Defaults to the module-level
			``max_review_length``.

	Returns:
		list: ``maxlen`` indices. Missing or empty text gives all zeros;
		otherwise the indices come from ``sequence.pad_sequences`` with its
		default padding/truncation settings.
	"""
	# One length drives both the padding and the all-zero fallback. The
	# original padded to a literal 128 but sized the fallback from
	# embedding_vecor_length, which only matched by coincidence.
	if maxlen is None:
		maxlen = max_review_length
	# `s is None` is tested first so that None never reaches isfloat().
	if s is None or isfloat(s):
		return [0] * maxlen
	seg_list = s.split()
	if not seg_list:
		return [0] * maxlen
	X = np.array(getWordSeq(seg_list, wordpos))
	# pad_sequences expects a list of sequences; X holds exactly one row.
	X = sequence.pad_sequences(X, maxlen=maxlen)
	return list(chain(*X))
			


## Inverse direction of string2sequence: given a sequence of vocabulary indices
## and an index-to-word mapping, produce the space-separated string of words.
def sequence2string(s, wordpos):
	"""Decode a sequence of indices into a space-separated string.

	Args:
		s: Iterable of indices.
		wordpos: Mapping from index to word.

	Returns:
		str: The words for every non-zero index present in ``wordpos``, in
		input order, joined by single spaces. Zero entries and unknown
		indices are skipped.
	"""
	decoded = [wordpos[idx] for idx in s if idx != 0 and idx in wordpos]
	return " ".join(decoded)

	
		
# strings2sequences("中国 人民".decode('utf-8'))